*===============================================================================
*=== FILE 1 --- IMPORT AND CLEAN FOR-SALE ZILLOW INFORMATION
*===============================================================================
* Session setup: turn off output paging and start with no data in memory.
set more off
clear

* The working-directory path is left empty here; capture suppresses any
* error from the cd call so the script keeps running.
capture cd ""


*===============================================================================
*=== PROPERTY FACTS
*===============================================================================
* Read the raw property facts file; the two-character sequence || is the
* field delimiter.
import delimited "data\pseudo\PropFacts.txt", delimiter("||", asstring) clear 

* The imported names sit one column to the left of where they are wanted:
* the first column is discarded and every remaining variable takes the name
* of the variable that preceded it. The renames are issued one at a time,
* in list order, so each target name has already been freed by the
* previous step.
drop propertyid

local shift_from bedroomcnt bathroomcnt finishedsq lotsizesquarefeet ///
    yearbuilt majorremodelyear housenumber housenumberfraction ///
    streetdirectionprefix streetname streetsuffix streetdirectionsuffix ///
    unitprefix unitnumber city state postalcode zipplusfour v20

local shift_to propertyid bedroomcnt bathroomcnt finishedsq ///
    lotsizesquarefeet yearbuilt majorremodelyear housenumber ///
    housenumberfraction streetdirectionprefix streetname streetsuffix ///
    streetdirectionsuffix unitprefix unitnumber city state postalcode ///
    zipplusfour

local n_shift : word count `shift_from'
forvalues k = 1/`n_shift' {
    local from_name : word `k' of `shift_from'
    local to_name   : word `k' of `shift_to'
    rename `from_name' `to_name'
}

* This save produces an example FourState_facts file.
* The merge further down in this script loads a different FourState_facts
* file, one that ensures the sample merges with the listings.
save "replication output\FourState_facts", replace


*===============================================================================
*=== FOR-SALE LISTING WORD PAIR COUNTS
*===============================================================================
* Load the word / word-pair count flags produced for the for-sale listings.
import delimited "replication output\FourState_FS_listings_word_pairs_count_flags.csv", clear

* Discard the first row of the file.
drop if _n==1

* Columns 2-5 carry the counts and the matched words / pairs.
rename (v2 v3 v4 v5) (word_count pair_count all_words all_pairs)

* Column 1 is a ||-delimited string; break it into original1, original2, ...
* and keep only the pieces used as identifiers further down.
split v1, parse(||) generate(original)

rename (original2 original3 original5 original6) ///
       (propertyid postingid sellingprice listingcreationdate)

* Remove the unused pieces together with the unsplit source column.
drop original* v1

* Convert the identifier and price fields parsed from text into numbers.
destring propertyid postingid sellingprice, replace

* A row is unusable when any part of the listing key is missing.
drop if missing(propertyid) | missing(postingid) | missing(listingcreationdate)

* Remove rows that are identical on every variable.
duplicates drop

compress

order propertyid postingid listingcreationdate word_count

* Within each listing key, put the row with the highest word count (then
* the highest pair count) first, so that it is the one retained when the
* remaining key-level duplicates are dropped.
gsort propertyid postingid listingcreationdate -word_count -pair_count

duplicates drop propertyid postingid listingcreationdate, force

* Attach property facts to each listing and retain only listings whose
* propertyid is found in the facts file.
merge m:1 propertyid using "data\preprocessed\pseudo\FourState_facts.dta", keep(match) nogenerate

save "replication output\FourState_FS_merged", replace

* Matched subset: listings with a positive word count or pair count.
drop if !(word_count>0 | pair_count>0)

save "replication output\FourState_FS_merged_matches", replace


*===============================================================================
*=== Now import original split file to add textblock to matched word file
*===============================================================================
* Read the raw listings file; the two-character sequence || is the field
* delimiter.
import delimited "data\pseudo\FS_listings.txt", delimiter("||", asstring) bindquote(loose) clear

* Move every imported name one column to the right. The first column is
* renamed string_fragments and is treated below as free text
* (presumably pieces of descriptions that were broken across lines --
* confirm against the raw file).
rename propertyid string_fragments
rename postingid propertyid
rename postingtypeid postingid
rename sellingprice postingtypeid
rename listingcreationdate sellingprice
rename listingdescription listingcreationdate
rename v7 listingdescription

*FIX THE STRINGS FIRST

* Values in the first column that parse as numbers are not text fragments;
* blank them out so only genuine text remains.
destring string_fragments, gen(fragment_num) force
replace string_fragments="" if fragment_num!=.
drop fragment_num

* Blank out fragments made up only of spaces, whatever their length.
* The earlier version listed strings of one to five spaces individually,
* which let longer runs of spaces through as if they were text.
replace string_fragments="" if trim(string_fragments)==""


* Append text fragments from following rows onto listingdescription.
* A row with a non-missing propertyid is the row being extended; rows
* after it that have a missing propertyid supply the fragments.
* previous_ob holds the condition built so far: this row has a propertyid
* and every look-ahead row examined up to now does not.
local previous_ob propertyid!=. 

* Look ahead up to 50 rows; fragments further away than that are not appended.
forvalues i=1/50 {

* Extend the condition: the row i positions ahead must also lack a propertyid.
local previous_obs_list `previous_ob' & propertyid[_n+`i']==.

* Append the fragment i rows ahead, separated by a space, when it is
* non-empty and the whole chain of conditions holds.
replace listingdescription = listingdescription + " " + string_fragments[_n+`i'] if string_fragments[_n+`i']!="" & `previous_obs_list'

* Carry the extended condition into the next iteration.
local previous_ob `previous_obs_list'

}
* The fragments have been folded into listingdescription where applicable.
drop string_fragments

* Variables that identify a listing.
local listing_key propertyid postingid listingcreationdate

* Rows without a propertyid cannot be matched to anything; remove them.
drop if missing(propertyid)

* Where a listing key occurs more than once, keep the row with the longest
* description: sort longest-first within key, then drop later duplicates.
generate stringlength = length(listingdescription)

gsort `listing_key' -stringlength

duplicates drop `listing_key', force
isid `listing_key'
drop stringlength

* Bring the description text onto the matched word / pair file, keeping
* only listings present in both.
merge 1:1 `listing_key' using "replication output\FourState_FS_merged_matches", keep(match) nogenerate

save  "replication output\FourState_FS_merged_matches_text", replace

* The intermediate matches file is superseded by the _text version.
erase "replication output\FourState_FS_merged_matches.dta"

